1 /*
2 * Copyright (C) 2008 The Guava Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package com.google.common.net;
18
19 import static com.google.common.base.Preconditions.checkNotNull;
20
21 import com.google.common.annotations.Beta;
22 import com.google.common.annotations.GwtCompatible;
23 import com.google.common.escape.UnicodeEscaper;
24
25 /**
26 * A {@code UnicodeEscaper} that escapes some set of Java characters using a
27 * UTF-8 based percent encoding scheme. The set of safe characters (those which
28 * remain unescaped) can be specified on construction.
29 *
30 * <p>This class is primarily used for creating URI escapers in {@link
31 * UrlEscapers} but can be used directly if required. While URI escapers impose
32 * specific semantics on which characters are considered 'safe', this class has
33 * a minimal set of restrictions.
34 *
35 * <p>When escaping a String, the following rules apply:
36 * <ul>
37 * <li>All specified safe characters remain unchanged.
38 * <li>If {@code plusForSpace} was specified, the space character " " is
39 * converted into a plus sign {@code "+"}.
40 * <li>All other characters are converted into one or more bytes using UTF-8
41 * encoding and each byte is then represented by the 3-character string
42 * "%XX", where "XX" is the two-digit, uppercase, hexadecimal representation
43 * of the byte value.
44 * </ul>
45 *
46 * <p>For performance reasons the only currently supported character encoding of
47 * this class is UTF-8.
48 *
49 * <p><b>Note:</b> This escaper produces uppercase hexadecimal sequences. From
50 * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
51 * <i>"URI producers and normalizers should use uppercase hexadecimal digits
52 * for all percent-encodings."</i>
53 *
54 * @author David Beaumont
55 * @since 15.0
56 */
57 @Beta
58 @GwtCompatible
59 public final class PercentEscaper extends UnicodeEscaper {
60
61 // In some escapers spaces are escaped to '+'
62 private static final char[] PLUS_SIGN = { '+' };
63
64 // Percent escapers output upper case hex digits (uri escapers require this).
65 private static final char[] UPPER_HEX_DIGITS =
66 "0123456789ABCDEF".toCharArray();
67
68 /**
69 * If true we should convert space to the {@code +} character.
70 */
71 private final boolean plusForSpace;
72
73 /**
74 * An array of flags where for any {@code char c} if {@code safeOctets[c]} is
75 * true then {@code c} should remain unmodified in the output. If
76 * {@code c > safeOctets.length} then it should be escaped.
77 */
78 private final boolean[] safeOctets;
79
80 /**
81 * Constructs a percent escaper with the specified safe characters and
82 * optional handling of the space character.
83 *
84 * <p>Not that it is allowed, but not necessarily desirable to specify {@code %}
85 * as a safe character. This has the effect of creating an escaper which has no
86 * well defined inverse but it can be useful when escaping additional characters.
87 *
88 * @param safeChars a non null string specifying additional safe characters
89 * for this escaper (the ranges 0..9, a..z and A..Z are always safe and
90 * should not be specified here)
91 * @param plusForSpace true if ASCII space should be escaped to {@code +}
92 * rather than {@code %20}
93 * @throws IllegalArgumentException if any of the parameters were invalid
94 */
95 public PercentEscaper(String safeChars, boolean plusForSpace) {
96 // TODO(user): Switch to static factory methods for creation now that class is final.
97 // TODO(user): Support escapers where alphanumeric chars are not safe.
98 checkNotNull(safeChars); // eager for GWT.
99 // Avoid any misunderstandings about the behavior of this escaper
100 if (safeChars.matches(".*[0-9A-Za-z].*")) {
101 throw new IllegalArgumentException(
102 "Alphanumeric characters are always 'safe' and should not be " +
103 "explicitly specified");
104 }
105 safeChars += "abcdefghijklmnopqrstuvwxyz" +
106 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
107 "0123456789";
108 // Avoid ambiguous parameters. Safe characters are never modified so if
109 // space is a safe character then setting plusForSpace is meaningless.
110 if (plusForSpace && safeChars.contains(" ")) {
111 throw new IllegalArgumentException(
112 "plusForSpace cannot be specified when space is a 'safe' character");
113 }
114 this.plusForSpace = plusForSpace;
115 this.safeOctets = createSafeOctets(safeChars);
116 }
117
118 /**
119 * Creates a boolean array with entries corresponding to the character values
120 * specified in safeChars set to true. The array is as small as is required to
121 * hold the given character information.
122 */
123 private static boolean[] createSafeOctets(String safeChars) {
124 int maxChar = -1;
125 char[] safeCharArray = safeChars.toCharArray();
126 for (char c : safeCharArray) {
127 maxChar = Math.max(c, maxChar);
128 }
129 boolean[] octets = new boolean[maxChar + 1];
130 for (char c : safeCharArray) {
131 octets[c] = true;
132 }
133 return octets;
134 }
135
136 /*
137 * Overridden for performance. For unescaped strings this improved the
138 * performance of the uri escaper from ~760ns to ~400ns as measured by
139 * {@link CharEscapersBenchmark}.
140 */
141 @Override
142 protected int nextEscapeIndex(CharSequence csq, int index, int end) {
143 checkNotNull(csq);
144 for (; index < end; index++) {
145 char c = csq.charAt(index);
146 if (c >= safeOctets.length || !safeOctets[c]) {
147 break;
148 }
149 }
150 return index;
151 }
152
153 /*
154 * Overridden for performance. For unescaped strings this improved the
155 * performance of the uri escaper from ~400ns to ~170ns as measured by
156 * {@link CharEscapersBenchmark}.
157 */
158 @Override
159 public String escape(String s) {
160 checkNotNull(s);
161 int slen = s.length();
162 for (int index = 0; index < slen; index++) {
163 char c = s.charAt(index);
164 if (c >= safeOctets.length || !safeOctets[c]) {
165 return escapeSlow(s, index);
166 }
167 }
168 return s;
169 }
170
171 /**
172 * Escapes the given Unicode code point in UTF-8.
173 */
174 @Override
175 protected char[] escape(int cp) {
176 // We should never get negative values here but if we do it will throw an
177 // IndexOutOfBoundsException, so at least it will get spotted.
178 if (cp < safeOctets.length && safeOctets[cp]) {
179 return null;
180 } else if (cp == ' ' && plusForSpace) {
181 return PLUS_SIGN;
182 } else if (cp <= 0x7F) {
183 // Single byte UTF-8 characters
184 // Start with "%--" and fill in the blanks
185 char[] dest = new char[3];
186 dest[0] = '%';
187 dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
188 dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
189 return dest;
190 } else if (cp <= 0x7ff) {
191 // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
192 // Start with "%--%--" and fill in the blanks
193 char[] dest = new char[6];
194 dest[0] = '%';
195 dest[3] = '%';
196 dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
197 cp >>>= 4;
198 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
199 cp >>>= 2;
200 dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
201 cp >>>= 4;
202 dest[1] = UPPER_HEX_DIGITS[0xC | cp];
203 return dest;
204 } else if (cp <= 0xffff) {
205 // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
206 // Start with "%E-%--%--" and fill in the blanks
207 char[] dest = new char[9];
208 dest[0] = '%';
209 dest[1] = 'E';
210 dest[3] = '%';
211 dest[6] = '%';
212 dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
213 cp >>>= 4;
214 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
215 cp >>>= 2;
216 dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
217 cp >>>= 4;
218 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
219 cp >>>= 2;
220 dest[2] = UPPER_HEX_DIGITS[cp];
221 return dest;
222 } else if (cp <= 0x10ffff) {
223 char[] dest = new char[12];
224 // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
225 // Start with "%F-%--%--%--" and fill in the blanks
226 dest[0] = '%';
227 dest[1] = 'F';
228 dest[3] = '%';
229 dest[6] = '%';
230 dest[9] = '%';
231 dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
232 cp >>>= 4;
233 dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
234 cp >>>= 2;
235 dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
236 cp >>>= 4;
237 dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
238 cp >>>= 2;
239 dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
240 cp >>>= 4;
241 dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
242 cp >>>= 2;
243 dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
244 return dest;
245 } else {
246 // If this ever happens it is due to bug in UnicodeEscaper, not bad input.
247 throw new IllegalArgumentException(
248 "Invalid unicode character value " + cp);
249 }
250 }
251 }